{ "cells": [ { "cell_type": "markdown", "id": "92f5ec54", "metadata": {}, "source": [ "## String functions on Columns" ] }, { "cell_type": "markdown", "id": "877b8bc5", "metadata": {}, "source": [ "### EDA for columns" ] }, { "cell_type": "code", "execution_count": 1, "id": "06e8f3f1", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 3, "id": "373789d6", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>First Name</th>\n", " <th>Age</th>\n", " <th>Gender</th>\n", " <th>City</th>\n", " <th>Place of Work</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>Sahil</td>\n", " <td>10</td>\n", " <td>M</td>\n", " <td>J</td>\n", " <td>True</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>Sonia</td>\n", " <td>20</td>\n", " <td>F</td>\n", " <td>K</td>\n", " <td>False</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>Sourav</td>\n", " <td>30</td>\n", " <td>M</td>\n", " <td>L</td>\n", " <td>False</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>Vishal</td>\n", " <td>40</td>\n", " <td>M</td>\n", " <td>P</td>\n", " <td>True</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " First Name Age Gender City Place of Work\n", "0 Sahil 10 M J True\n", "1 Sonia 20 F K False\n", "2 Sourav 30 M L False\n", "3 Vishal 40 M P True" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df=pd.DataFrame({\n", "'First Name':['Sahil','Sonia','Sourav','Vishal'],\n", "'Age':[10,20,30,40],\n", "'Gender':['M','F','M','M'],\n", "'City':['J','K','L','P'],\n", "'Place of Work':[True,False,False,True],\n", "}\n", ")\n", "df" ] }, { "cell_type": "markdown", "id": "38e34482", "metadata": {}, "source": [ "#### Get columns as list" ] }, { "cell_type": "code", "execution_count": 4, "id": "e4d76140", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['First Name', 'Age', 'Gender', 'City', 'Place of Work']" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.columns.tolist()" ] }, { "cell_type": "markdown", "id": "102fad7d", "metadata": {}, "source": [ "#### Convert column names to series | df:" ] }, { "cell_type": "code", "execution_count": 9, "id": "ffe89310", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "First Name First Name\n", "Age Age\n", "Gender Gender\n", "City City\n", "Place of Work Place of Work\n", "dtype: object" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.columns.to_series()" ] }, { "cell_type": "code", "execution_count": 11, "id": "f8870e68", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>0</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>First Name</th>\n", " <td>First Name</td>\n", " </tr>\n", " <tr>\n", " <th>Age</th>\n", " <td>Age</td>\n", " </tr>\n", " <tr>\n", " <th>Gender</th>\n", " <td>Gender</td>\n", " </tr>\n", " <tr>\n", " <th>City</th>\n", " <td>City</td>\n", " </tr>\n", " <tr>\n", " <th>Place of Work</th>\n", " <td>Place of Work</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " 0\n", "First Name First Name\n", "Age Age\n", "Gender Gender\n", "City City\n", "Place of Work Place of Work" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.columns.to_frame()" ] }, { "cell_type": "markdown", "id": "d794ed40", "metadata": {}, "source": [ "#### Check if specific column is there or not" ] }, { "cell_type": "code", "execution_count": 15, "id": "939a91ff", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ True, False, False, False, False])" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.columns.str.contains('Name') " ] }, { "cell_type": "markdown", "id": "4d7d7886", "metadata": {}, "source": [ "#### Check if any duplicate column is there" ] }, { "cell_type": "code", "execution_count": 17, "id": "1d5b6490", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([False, False, False, False, False])" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.columns.duplicated()" ] }, { "cell_type": "markdown", "id": "55079b22", "metadata": {}, "source": [ "#### Check methods/attributes of String" ] }, { "cell_type": "code", "execution_count": 20, "id": "12107440", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['__annotations__', '__class__', '__delattr__', '__dict__', '__dir__']" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dir(df.columns.str)[0:5]" ] }, { "cell_type": "markdown", "id": "e185987b", "metadata": {}, "source": [ "#### Make column names to lower case" ] }, { "cell_type": "code", "execution_count": 22, "id": "9b96b260", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['first name', 'age', 'gender', 'city', 'place of work'], dtype='object')" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.columns.str.lower()" ] }, { "cell_type": "markdown", "id": "f34c789f", "metadata": {}, "source": [ "#### Make column names to Upper case" ] }, { "cell_type": "code", "execution_count": 23, "id": "e74ca52a", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['FIRST NAME', 'AGE', 'GENDER', 'CITY', 'PLACE OF WORK'], dtype='object')" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.columns.str.upper()" ] }, { "cell_type": "markdown", "id": "f41bfa14", "metadata": {}, "source": [ "#### Make column names to Title case" ] }, { "cell_type": "code", "execution_count": 26, "id": "b1c5a382", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['First Name', 'Age', 'Gender', 'City', 'Place Of Work'], dtype='object')" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.columns.str.title() # Camel Case" ] }, { "cell_type": "markdown", "id": "bce613bd", "metadata": {}, "source": [ "#### Make column names to Capitalize" ] }, { "cell_type": "code", "execution_count": 28, "id": "42cce642", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['First name', 'Age', 'Gender', 'City', 'Place of work'], dtype='object')" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.columns.str.capitalize() # Only first letter big" ] }, { "cell_type": "markdown", "id": "3367e158", "metadata": {}, "source": [ "#### Replace empty spaces with underscores" ] }, { "cell_type": "code", "execution_count": 29, "id": "ce8a4321", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['First-Name', 'Age', 'Gender', 'City', 'Place-of-Work'], dtype='object')" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.columns.str.replace(' ','-')" ] }, { "cell_type": "markdown", "id": "3ce3d474", "metadata": {}, "source": [ "#### Rename columns" ] }, { "cell_type": "code", "execution_count": 30, "id": "f1c0b479", "metadata": {}, "outputs": [], "source": [ "df.rename(columns={'oldname':'newname'},inplace=True)" ] }, { "cell_type": "markdown", "id": "17232ae6", "metadata": {}, "source": [ "#### Check total number of columns" ] }, { "cell_type": "code", "execution_count": 32, "id": "fb541c61", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "5" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(df.columns)" ] }, { "cell_type": "markdown", "id": "4448982c", "metadata": {}, "source": [ "#### Select particular columns" ] }, { "cell_type": "code", "execution_count": 33, "id": "8c25c05f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['First Name', 'Age', 'Gender', 'City'], dtype=object)" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.columns.values[0:4]" ] }, { "cell_type": "markdown", "id": "a271a055", "metadata": {}, "source": [ "#### Get 2nd column and rename it" ] }, { "cell_type": "code", "execution_count": 34, "id": "80ddb35d", "metadata": {}, "outputs": [], "source": [ "df.columns.values[2]='DOB'" ] }, { "cell_type": "code", "execution_count": 35, "id": "16c3bccc", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>First Name</th>\n", " <th>Age</th>\n", " <th>DOB</th>\n", " <th>City</th>\n", " <th>Place of Work</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>Sahil</td>\n", " <td>10</td>\n", " <td>M</td>\n", " <td>J</td>\n", " <td>True</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>Sonia</td>\n", " <td>20</td>\n", " <td>F</td>\n", " <td>K</td>\n", " <td>False</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>Sourav</td>\n", " <td>30</td>\n", " <td>M</td>\n", " <td>L</td>\n", " <td>False</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>Vishal</td>\n", " <td>40</td>\n", " <td>M</td>\n", " <td>P</td>\n", " <td>True</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " First Name Age DOB City Place of Work\n", "0 Sahil 10 M J True\n", "1 Sonia 20 F K False\n", "2 Sourav 30 M L False\n", "3 Vishal 40 M P True" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "id": "e9eea57a", "metadata": {}, "source": [ "#### Select all columns except one" ] }, { "cell_type": "code", "execution_count": 40, "id": "0cc2184f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['First Name', 'Age', 'City', 'Place of Work'], dtype='object')" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.columns[df.columns!= 'DOB']" ] }, { "cell_type": "code", "execution_count": 41, "id": "04f9d131", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>First Name</th>\n", " <th>Age</th>\n", " <th>DOB</th>\n", " <th>City</th>\n", " <th>Place of Work</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>Sahil</td>\n", " <td>10</td>\n", " <td>M</td>\n", " <td>J</td>\n", " <td>True</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>Sonia</td>\n", " <td>20</td>\n", " <td>F</td>\n", " <td>K</td>\n", " <td>False</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>Sourav</td>\n", " <td>30</td>\n", " <td>M</td>\n", " <td>L</td>\n", " <td>False</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>Vishal</td>\n", " <td>40</td>\n", " <td>M</td>\n", " <td>P</td>\n", " <td>True</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " First Name Age DOB City Place of Work\n", "0 Sahil 10 M J True\n", "1 Sonia 20 F K False\n", "2 Sourav 30 M L False\n", "3 Vishal 40 M P True" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "id": "d41853db", "metadata": {}, "source": [ "#### Select all columns except multiple" ] }, { "cell_type": "code", "execution_count": 43, "id": "06ca1e33", "metadata": {}, "outputs": [ { "ename": "AttributeError", "evalue": "'numpy.ndarray' object has no attribute 'columns'", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", "\u001b[1;32m<ipython-input-43-3cca24522c75>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mdf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m-\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0misin\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'DOB'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;34m'City'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[1;31mAttributeError\u001b[0m: 'numpy.ndarray' object has no attribute 'columns'" ] } ], "source": [ "#?\n", "df.loc[:,-df.columns.isin(['DOB','City']).columns]" ] }, { "cell_type": "markdown", "id": "1d592bf9", "metadata": {}, "source": [ "#### Select column names that begins with particular word" ] }, { "cell_type": "code", "execution_count": 48, "id": "a71af597", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ True, False, False, False, False])" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.columns.str.startswith('First')\n", "# Gives array of booleans" ] }, { "cell_type": "markdown", "id": "ca67d9aa", "metadata": {}, "source": [ "#### Select group of column names" ] }, { "cell_type": "code", "execution_count": 46, "id": "9d6ffb19", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['First Name', 'Age', 'DOB'], dtype=object)" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.columns.values[[0,1,2]]" ] }, { "cell_type": "code", "execution_count": 47, "id": "71ea54bb", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['First Name', 'Age', 'DOB'], dtype='object')" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.columns[0:3]" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.8" } }, "nbformat": 4, "nbformat_minor": 5 }